Library
library(tidyverse)
library(here)
library(readxl)
library(janitor)
library(stringr)
library(tidyr)
library(dplyr)
library(knitr)
library(ggplot2)
library(factoextra)
library(gt)
library(naniar)
library(ggcorrplot)
library(corrplot)
library(GGally)
library(randomForest)
library(e1071)
library(caret)
library(plotly)
library(stargazer)
library(broom)
library(kableExtra)
library(gridExtra)
library(patchwork)
library(minerva)
library(mgcv)
library(MASS)
library(glmnet)
library(purrr)
library(insight)
library(grid)
Data Import
raw_bp <- read.csv(here("ZIPallFiles","AHSnpa11bp.csv"), header=TRUE)
raw_bb <- read.csv(here("ZIPallFiles","AHSnpa11bb.csv"), header=TRUE)
raw_bsp <- read.csv(here("ZIPallFiles","inp13bsp.csv"), header=TRUE)
raw_bcn <- read.csv(here("ZIPallFiles","inp13bcn.csv"), header=TRUE)
raw_bhh <- read.csv(here("ZIPallFiles","inp13bhh.csv"), header=TRUE)
Rename and Created
bp_bb <- inner_join(raw_bp, raw_bb, by = "ABSPID") %>%
rename(HypertensionStatus = HYPBC,
BodyMass = SABDYMS,
SocialStatus = SF2SA1QN,
PersonID = ABSPID,
Age = AGEC,
Gender = SEX,
BMI = BMISC,
Systolic = SYSTOL,
Diastolic = DIASTOL,
SmokeStatus = SMKSTAT,
AlcoholPercentage_Day1 = ALCPER1,
AlcoholPercentage_Day2 = ALCPER2,
Potassium_Day1 = POTAST1,
Potassium_Day2 = POTAST2,
Sodium_Day1 = SODIUMT1,
Sodium_Day2 = SODIUMT2,
FibreEnergy_Day1 = FIBRPER1,
FibreEnergy_Day2 = FIBRPER2,
CarbohydrateEnergy_Day1 = CHOPER1,
CarbohydrateEnergy_Day2 = CHOPER2,
EnergyBMR_Day1 = EIBMR1,
EnergyBMR_Day2 = EIBMR2) %>%
mutate(Identity = factor(0))
bsp_bhh<- left_join(raw_bsp, raw_bhh,by = c("ABSHID")) %>%
rename(BodyMass = SABDYMS,
SocialStatus = SF2SA1DB,
HouseID = ABSHID,
Age = AGEEC,
Gender = SEX,
BMI = BMISC,
Systolic = SYSTOL,
Diastolic = DIASTOL,
SmokeStatus = SMKSTAT,
AlcoholPercentage_Day1 = ALCPER1,
AlcoholPercentage_Day2 = ALCPER2,
Potassium_Day1 = POTAST1,
Potassium_Day2 = POTAST2,
Sodium_Day1 = SODIUMT1,
Sodium_Day2 = SODIUMT2,
FibreEnergy_Day1 = FIBRPER1,
FibreEnergy_Day2 = FIBRPER2,
CarbohydrateEnergy_Day1 = CHOPER1,
CarbohydrateEnergy_Day2 = CHOPER2,
EnergyBMR_Day1 = EIBMR1,
EnergyBMR_Day2 = EIBMR2)%>%
mutate(Identity = factor(1))
bcn <- raw_bcn %>%
rename(HouseID = ABSHID,
MedicalCondition = ICD10ME)
First Filter
bp_bb <- bp_bb %>%
mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
filter(
Age >= 18,
HypertensionStatus == 5,
BodyMass != 4,
(is.na(EnergyBMR) | EnergyBMR >= 0.9))
bsp_bhh <- bsp_bhh %>%
mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
filter(
Age >= 18,
BodyMass != 4,
!HouseID %in% bcn$HouseID[bcn$MedicalCondition %in% c(7, 20)],
(is.na(EnergyBMR) | EnergyBMR >= 0.9))
Merge
raw_data <- bind_rows(bp_bb,bsp_bhh)
Variable Selection
selected_data <- raw_data %>%
dplyr::select(PersonID, HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, Identity,
AlcoholPercentage_Day1, AlcoholPercentage_Day2,
Potassium_Day1, Potassium_Day2, Sodium_Day1, Sodium_Day2,
FibreEnergy_Day1, FibreEnergy_Day2,
CarbohydrateEnergy_Day1, CarbohydrateEnergy_Day2)
Type Check 1
data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
gt() %>%
tab_header(
title = "Variable Names and Their Types"
) %>%
tab_caption(caption = md("Variable types table"))
Variable types table
| Variable Names and Their Types |
| Variable |
Type |
| PersonID |
character |
| HouseID |
character |
| SocialStatus |
integer |
| Age |
integer |
| Gender |
integer |
| BMI |
numeric |
| Systolic |
integer |
| Diastolic |
integer |
| SmokeStatus |
integer |
| Identity |
factor |
| AlcoholPercentage_Day1 |
numeric |
| AlcoholPercentage_Day2 |
numeric |
| Potassium_Day1 |
numeric |
| Potassium_Day2 |
numeric |
| Sodium_Day1 |
numeric |
| Sodium_Day2 |
numeric |
| FibreEnergy_Day1 |
numeric |
| FibreEnergy_Day2 |
numeric |
| CarbohydrateEnergy_Day1 |
numeric |
| CarbohydrateEnergy_Day2 |
numeric |
Na values + New Variables + Type Conversion
selected_data <- selected_data %>%
mutate(
across(c(Gender, SocialStatus, SmokeStatus), as.factor),
BMI = na_if(BMI, 98) %>% na_if(99),
Systolic = na_if(Systolic, 0) %>% na_if(998) %>% na_if(999),
Diastolic = na_if(Diastolic, 0) %>% na_if(998) %>% na_if(999),
Potassium = (Potassium_Day1 + Potassium_Day2) / 2,
Sodium = (Sodium_Day1 + Sodium_Day2) / 2,
FibreEnergy = (FibreEnergy_Day1 + FibreEnergy_Day2)/2,
CarbohydrateEnergy = (CarbohydrateEnergy_Day1 + CarbohydrateEnergy_Day2) / 2,
AlcoholPercentage = (AlcoholPercentage_Day1 + AlcoholPercentage_Day2) /2,
SodiumPotassiumRatio = Sodium / Potassium
) %>%
dplyr::select(PersonID,HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, AlcoholPercentage, SodiumPotassiumRatio, FibreEnergy, CarbohydrateEnergy, Identity)
Type Check 2
data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
gt() %>%
tab_header(
title = "Variable Names and Their Types"
) %>%
tab_caption(caption = md("Variable types table after correction"))
Variable types table after correction
| Variable Names and Their Types |
| Variable |
Type |
| PersonID |
character |
| HouseID |
character |
| SocialStatus |
factor |
| Age |
integer |
| Gender |
factor |
| BMI |
numeric |
| Systolic |
integer |
| Diastolic |
integer |
| SmokeStatus |
factor |
| AlcoholPercentage |
numeric |
| SodiumPotassiumRatio |
numeric |
| FibreEnergy |
numeric |
| CarbohydrateEnergy |
numeric |
| Identity |
factor |
Duplicate Values
selected_data %>%
dplyr::select(-PersonID) %>%
distinct() %>%
dim() %>%
tibble::tibble(Dimension = c("Rows", "Columns"), Count = .) %>%
gt() %>%
tab_header(title = "Dimensions of Unique Tibble") %>%
tab_caption(caption = md("Dimensions of Unique Data Table"))
Dimensions of Unique Data Table
| Dimensions of Unique Tibble |
| Dimension |
Count |
| Rows |
8449 |
| Columns |
13 |
Merge Variable
selected_data <- selected_data %>%
mutate(SmokeStatus = recode_factor(SmokeStatus, "2" = "1", "3" = "1","4" = "2","5" = "3"))
Outliers and New Variables
normal_ranges <- list(Systolic = c(90, 180), Diastolic = c(60, 120), BMI = c(16, 47.5),
SodiumPotassiumRatio = c(0, 4), AlcoholPercentage = c(0, 85),
CarbohydrateEnergy = c(0, 100), FibreEnergy = c(0, 100))
selected_data <- selected_data %>%
filter(
(is.na(Systolic) | (Systolic >= normal_ranges$Systolic[1] & Systolic <= normal_ranges$Systolic[2])),
(is.na(Diastolic) | (Diastolic >= normal_ranges$Diastolic[1] & Diastolic <= normal_ranges$Diastolic[2])),
(is.na(BMI) | (BMI >= normal_ranges$BMI[1] & BMI <= normal_ranges$BMI[2])),
(is.na(AlcoholPercentage) | (AlcoholPercentage >= normal_ranges$AlcoholPercentage[1] & AlcoholPercentage <= normal_ranges$AlcoholPercentage[2])),
(is.na(CarbohydrateEnergy) | (CarbohydrateEnergy >= normal_ranges$CarbohydrateEnergy[1] & CarbohydrateEnergy <= normal_ranges$CarbohydrateEnergy[2])),
(is.na(FibreEnergy) | (FibreEnergy >= normal_ranges$FibreEnergy[1] & FibreEnergy <= normal_ranges$FibreEnergy[2])),
(is.na(SodiumPotassiumRatio) | (SodiumPotassiumRatio >= normal_ranges$SodiumPotassiumRatio[1] & SodiumPotassiumRatio <= normal_ranges$SodiumPotassiumRatio[2]))
) %>%
mutate(Hypertension = as.factor(case_when(Systolic >= 120 | Diastolic >= 80 ~ 1,TRUE ~ 0)))
Variable check
variable_data <- data.frame(
Variable = c("PersonID", "HouseID", "SocialStatus", "Age", "Gender", "BMI",
"Systolic", "Diastolic", "SmokeStatus", "AlcoholPercentage",
"SodiumPotassiumRatio", "FibreEnergy", "CarbohydrateEnergy",
"Identity"),
Description = c("Selected person identifier", "Household identifier",
"ICD10 Conditions - Mini classification",
"Age of person", "Sex of person",
"Body Mass Index (BMI) - score measured",
"Systolic Blood Pressure (mmHg)",
"Diastolic Blood Pressure (mmHg)", "Smoking Status",
"Average of Percentage of energy from alcohol (Day1) and Percentage of energy from alcohol (Day2)",
"The ratio of the average of Sodium (supplement) Day 1 mg and Sodium (supplement) Day 2 mg over the average of Potassium (total) Day 1 mg and Potassium (total) Day 2 mg",
"Average of Percentage of energy from fibre (Day1) and Percentage of energy from fibre (Day2)",
"Average of Percentage of energy from carbohydrate (Day1) and Percentage of energy from carbohydrate (Day2)",
"Indicates whether the individual is part of the indigenous population"))
variable_table <- variable_data %>%
gt() %>%
tab_header(title = "All related variables") %>%
cols_label(Variable = "Variable",Description = "Description") %>%
tab_caption(caption = md("Variable Descriptions"))%>%
fmt_markdown(columns = everything()) %>%
tab_options(table.width = pct(100))
variable_table
Variable Descriptions
| All related variables |
| Variable |
Description |
| PersonID |
Selected person identifier |
| HouseID |
Household identifier |
| SocialStatus |
ICD10 Conditions - Mini classification |
| Age |
Age of person |
| Gender |
Sex of person |
| BMI |
Body Mass Index (BMI) - score measured |
| Systolic |
Systolic Blood Pressure (mmHg) |
| Diastolic |
Diastolic Blood Pressure (mmHg) |
| SmokeStatus |
Smoking Status |
| AlcoholPercentage |
Average of Percentage of energy from alcohol (Day1) and Percentage of energy from alcohol (Day2) |
| SodiumPotassiumRatio |
The ratio of the average of Sodium (supplement) Day 1 mg and Sodium (supplement) Day 2 mg over the average of Potassium (total) Day 1 mg and Potassium (total) Day 2 mg |
| FibreEnergy |
Average of Percentage of energy from fibre (Day1) and Percentage of energy from fibre (Day2) |
| CarbohydrateEnergy |
Average of Percentage of energy from carbohydrate (Day1) and Percentage of energy from carbohydrate (Day2) |
| Identity |
Indicates whether the individual is part of the indigenous population |
This table may require table number as it may be placed in
Methodology.
selected_predictors <- selected_data %>%
dplyr::select(FibreEnergy, CarbohydrateEnergy)
predictor_table <- data.frame(Variable = names(selected_predictors), Type = sapply(selected_predictors, class)) %>%
gt() %>%
tab_header(title = "Variable Names and Their Types") %>%
tab_caption(caption = md("Predictor variables and their data types"))
predictor_table
Predictor variables and their data types
| Variable Names and Their Types |
| Variable |
Type |
| FibreEnergy |
numeric |
| CarbohydrateEnergy |
numeric |
confounding_vars <- selected_data %>%
dplyr::select(-FibreEnergy, -CarbohydrateEnergy,-PersonID, -HouseID)
confounding_table <- data.frame(Variable = names(confounding_vars), Type = sapply(confounding_vars, class)) %>%
gt() %>%
tab_header(title = "Confounding Variables and Their Types") %>%
tab_caption(caption = md("Confounding variables and their data types"))
confounding_table
Confounding variables and their data types
| Confounding Variables and Their Types |
| Variable |
Type |
| SocialStatus |
factor |
| Age |
integer |
| Gender |
factor |
| BMI |
numeric |
| Systolic |
integer |
| Diastolic |
integer |
| SmokeStatus |
factor |
| AlcoholPercentage |
numeric |
| SodiumPotassiumRatio |
numeric |
| Identity |
factor |
| Hypertension |
factor |
format_counts <- function(count, total) {
paste(count, "(", round(count / total * 100, 1), "%)", sep = "")
}
format_counts_list <- function(values, data, column) {
sapply(values, function(x) format_counts(sum(data[[column]] == x), nrow(data)))
}
format_mean_sd <- function(mean_val, sd_val) {
paste(round(mean_val, 2), "(", round(sd_val, 2), ")", sep = "")
}
format_mean_sd_list <- function(data, columns) {
sapply(columns, function(col) format_mean_sd(mean(data[[col]], na.rm = TRUE), sd(data[[col]], na.rm = TRUE)))
}
get_social_status <- function(data) {
format_counts_list(1:5, data, "SocialStatus")
}
get_gender <- function(data) {
format_counts_list(1:2, data, "Gender")
}
get_smoke_status <- function(data) {
format_counts_list(1:3, data, "SmokeStatus")
}
get_identity <- function(data) {
format_counts_list(0:1, data, "Identity")
}
get_age <- function(data) {
c(
format_counts(sum(data$Age >= 18 & data$Age <= 49), nrow(data)),
format_counts(sum(data$Age > 50), nrow(data))
)
}
get_health_metrics <- function(data) {
format_mean_sd_list(data, c("BMI", "Systolic", "Diastolic", "AlcoholPercentage", "SodiumPotassiumRatio", "FibreEnergy", "CarbohydrateEnergy"))
}
riskdata <- selected_data %>% filter(Hypertension == 1)
noriskdata <- selected_data %>% filter(Hypertension == 0)
df <- data.frame(
Variable = c(
"Lowest 20%, N(%)", "Second quintile, N(%)", "Third quintile, N(%)", "Fourth quintile, N(%)", "Highest 20%*, N(%)",
"Male, N(%)", "Female, N(%)",
"Currently Smoker, N(%)", "Ex-Smoker, N(%)", "Never smoked, N(%)",
"Non Indigenous, N(%)", "Indigenous, N(%)",
"18-49, N(%)", ">50, N(%)",
"BMI, kg/m² , mean(SD)",
"Systolic, mmHg, mean(SD)", "Diastolic, mmHg, mean(SD)",
"Alcohol Percentage, mean(SD)",
"Sodium Potassium Ratio, mean(SD)",
"Fibre Energy, mean(SD)",
"Carbohydrate Energy, mean(SD)"
),
Group = c(
"SocialStatus", "SocialStatus", "SocialStatus", "SocialStatus", "SocialStatus",
"Gender", "Gender",
"SmokeStatus", "SmokeStatus", "SmokeStatus",
"Identity", "Identity",
"Age", "Age",
NA, NA, NA, NA, NA, NA, NA
),
Total = c(
get_social_status(selected_data),
get_gender(selected_data),
get_smoke_status(selected_data),
get_identity(selected_data),
get_age(selected_data),
get_health_metrics(selected_data)
),
HypertensionRisk = c(
get_social_status(riskdata),
get_gender(riskdata),
get_smoke_status(riskdata),
get_identity(riskdata),
get_age(riskdata),
get_health_metrics(riskdata)
),
NoHypertensionRisk = c(
get_social_status(noriskdata),
get_gender(noriskdata),
get_smoke_status(noriskdata),
get_identity(noriskdata),
get_age(noriskdata),
get_health_metrics(noriskdata)
)
)
total_count <- nrow(selected_data)
risk_count <- nrow(riskdata)
no_risk_count <- nrow(noriskdata)
df %>%
format_table(ci_brackets = c("(", ")")) %>%
export_table(format = "html", title = "Table 1: Demographic and Health Characteristics by Hypertension Status")
| Table 1: Demographic and Health Characteristics by Hypertension Status |
| Variable |
Total |
HypertensionRisk |
NoHypertensionRisk |
| SocialStatus |
| Lowest 20%, N(%) |
2164(27.3%) |
1079(28.7%) |
1085(26%) |
| Second quintile, N(%) |
1570(19.8%) |
769(20.4%) |
801(19.2%) |
| Third quintile, N(%) |
1430(18%) |
665(17.7%) |
765(18.3%) |
| Fourth quintile, N(%) |
1223(15.4%) |
572(15.2%) |
651(15.6%) |
| Highest 20%*, N(%) |
1545(19.5%) |
678(18%) |
867(20.8%) |
| Gender |
| Male, N(%) |
3639(45.9%) |
2001(53.2%) |
1638(39.3%) |
| Female, N(%) |
4293(54.1%) |
1762(46.8%) |
2531(60.7%) |
| SmokeStatus |
| Currently Smoker, N(%) |
2163(27.3%) |
1038(27.6%) |
1125(27%) |
| Ex-Smoker, N(%) |
2222(28%) |
1112(29.6%) |
1110(26.6%) |
| Never smoked, N(%) |
3547(44.7%) |
1613(42.9%) |
1934(46.4%) |
| Identity |
| Non Indigenous, N(%) |
6290(79.3%) |
2947(78.3%) |
3343(80.2%) |
| Indigenous, N(%) |
1642(20.7%) |
816(21.7%) |
826(19.8%) |
| Age |
| 18-49, N(%) |
5174(65.2%) |
2149(57.1%) |
3025(72.6%) |
| >50, N(%) |
2597(32.7%) |
1537(40.8%) |
1060(25.4%) |
|
| BMI, kg/m² , mean(SD) |
27.04(5.28) |
28.18(5.33) |
25.71(4.9) |
| Systolic, mmHg, mean(SD) |
120.49(15.92) |
130.47(13.58) |
107.74(7.21) |
| Diastolic, mmHg, mean(SD) |
77.26(9.73) |
82.73(8.91) |
70.27(5.2) |
| Alcohol Percentage, mean(SD) |
3.3(6.33) |
3.84(6.94) |
2.81(5.69) |
| Sodium Potassium Ratio, mean(SD) |
0.93(0.52) |
0.91(0.51) |
0.94(0.53) |
| Fibre Energy, mean(SD) |
1.62(0.92) |
1.61(0.92) |
1.64(0.93) |
| Carbohydrate Energy, mean(SD) |
32.57(13) |
32.42(13.06) |
32.71(12.96) |
|
Missing Values
selected_data %>%
dplyr::select(-PersonID, -HouseID) %>%
miss_var_summary() %>%
gt() %>%
tab_header(title = "Missing Value") %>%
tab_caption(caption = md("Missing Values in Selected Data")) %>%
cols_label(variable = "Variable Name", n_miss = "Missing Count", pct_miss = "Percentage Missing") %>%
fmt_number(columns = everything(), decimals = 2)
Missing Values in Selected Data
| Missing Value |
| Variable Name |
Missing Count |
Percentage Missing |
| BMI |
1,242.00 |
15.7 |
| Systolic |
1,222.00 |
15.4 |
| Diastolic |
1,222.00 |
15.4 |
| SocialStatus |
0.00 |
0 |
| Age |
0.00 |
0 |
| Gender |
0.00 |
0 |
| SmokeStatus |
0.00 |
0 |
| AlcoholPercentage |
0.00 |
0 |
| SodiumPotassiumRatio |
0.00 |
0 |
| FibreEnergy |
0.00 |
0 |
| CarbohydrateEnergy |
0.00 |
0 |
| Identity |
0.00 |
0 |
| Hypertension |
0.00 |
0 |
corr_matrix <- selected_data%>%
dplyr::select(where(is.numeric)) %>%
mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))) %>%
cor(use = "complete.obs")
ggcorrplot(corr_matrix, lab = TRUE,
title = "Correlation Heatmap for ALL",
colors = c("blue", "white", "red"),
tl.cex = 9, lab_size = 4) + theme(legend.position = "none")

Regression Imputation
selected_data$BMI[is.na(selected_data$BMI)] <- lm(BMI ~ Gender + SocialStatus + Age + SodiumPotassiumRatio + FibreEnergy + CarbohydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% predict(newdata = selected_data[is.na(selected_data$BMI), ])
selected_data$Systolic[is.na(selected_data$Systolic)] <- lm(Systolic ~ Gender + SocialStatus + Age + BMI + SodiumPotassiumRatio + FibreEnergy + CarbohydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% predict(newdata = selected_data[is.na(selected_data$Systolic), ])
selected_data$Diastolic[is.na(selected_data$Diastolic)] <- lm(Diastolic ~ Gender + SocialStatus + Age + BMI + Systolic + SodiumPotassiumRatio + FibreEnergy + CarbohydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% predict(newdata = selected_data[is.na(selected_data$Diastolic), ])
vis_miss(selected_data %>% dplyr::select(-PersonID, -HouseID)) +
ggtitle("Missing Data Visualization of Selected Variables")

Normalization
numeric_data <- selected_data %>% dplyr::select_if(is.numeric)
preprocess_params <- preProcess(numeric_data, method = c("center", "scale"))
scaled_numeric_data <- predict(preprocess_params, numeric_data)
normalized_data <- bind_cols(scaled_numeric_data, selected_data %>% dplyr::select_if(Negate(is.numeric)))
Multicollinearity
selected_data %>%
dplyr::select(where(is.numeric), -Systolic, -Diastolic) %>%
ggscatmat() +
ggtitle("Scatterplot Matrix of Selected Numeric Variables")

Correlation
Scatter plot
Fibre_syst_plot <- ggplot(selected_data, aes(x = FibreEnergy, y = Systolic)) +
geom_point(color = "#FF7F0E", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "#1F77B4", size = 1.5) +
labs(x = "Percentage of Energy from Fibre", y = "Systolic (mmHg)") +
theme_light(base_size = 15) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = Inf, y = Inf, label = "A", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")
Fibre_dias_plot <- ggplot(selected_data, aes(x = FibreEnergy, y = Diastolic)) +
geom_point(color = "#FF7F0E", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "#1F77B4", size = 1.5) +
labs(x = "Percentage of Energy from Fibre", y = "Diastolic (mmHg)") +
theme_light(base_size = 15) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = Inf, y = Inf, label = "B", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")
carbo_syst_plot <- ggplot(selected_data, aes(x = CarbohydrateEnergy, y = Systolic)) +
geom_point(color = "#2CA02C", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "#D62728", size = 1.5) +
labs(x = "Percentage of Energy from Carbohydrate", y = "Systolic (mmHg)") +
theme_light(base_size = 15) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = Inf, y = Inf, label = "C", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")
carbo_dias_plot <- ggplot(selected_data, aes(x = CarbohydrateEnergy, y = Diastolic)) +
geom_point(color = "#2CA02C", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "#D62728", size = 1.5) +
labs(x = "Percentage of Energy from Carbohydrate", y = "Diastolic (mmHg)") +
theme_light(base_size = 15) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = Inf, y = Inf, label = "D", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")
explanation <- textGrob(
label = "A. Scatter plot of Fibre Energy vs Systolic B. Scatter plot of Fibre Energy vs Diastolic\nC. Scatter plot of Carbohydrate Energy vs Systolic D. Scatter plot of Carbohydrate Energy vs Diastolic",
x = unit(0, "npc"), y = unit(0.1, "npc"), just = c("left", "bottom"),
gp = gpar(fontsize = 12, fontface = "italic")
)
grid.arrange(
arrangeGrob(
Fibre_syst_plot, Fibre_dias_plot, carbo_syst_plot, carbo_dias_plot,
nrow = 2,
top = textGrob("Figure 1: Relationship Between Energy Intake from Fibre and Carbohydrate and Blood Pressure",
gp = gpar(fontsize = 16))
),
explanation,
heights = unit.c(unit(1, "npc") - unit(3, "lines"), unit(3, "lines"))
)

Linear correlation table
Fibre_syst_pearson <- cor(selected_data$FibreEnergy, selected_data$Systolic, method = "pearson")
Fibre_syst_spearman <- cor(selected_data$FibreEnergy, selected_data$Systolic, method = "spearman")
Fibre_dias_pearson <- cor(selected_data$FibreEnergy, selected_data$Diastolic, method = "pearson")
Fibre_dias_spearman <- cor(selected_data$FibreEnergy, selected_data$Diastolic, method = "spearman")
carbo_syst_pearson <- cor(selected_data$CarbohydrateEnergy, selected_data$Systolic, method = "pearson")
carbo_syst_spearman <- cor(selected_data$CarbohydrateEnergy, selected_data$Systolic, method = "spearman")
carbo_dias_pearson <- cor(selected_data$CarbohydrateEnergy, selected_data$Diastolic, method = "pearson")
carbo_dias_spearman <- cor(selected_data$CarbohydrateEnergy, selected_data$Diastolic, method = "spearman")
correlation_table <- data.frame(
Measure = c("FibreEnergy vs Systolic", "FibreEnergy vs Diastolic",
"CarbohydrateEnergy vs Systolic", "CarbohydrateEnergy vs Diastolic"),
Pearson_Correlation = c(Fibre_syst_pearson, Fibre_dias_pearson, carbo_syst_pearson, carbo_dias_pearson),
Spearman_Correlation = c(Fibre_syst_spearman, Fibre_dias_spearman, carbo_syst_spearman, carbo_dias_spearman))
correlation_table %>%
gt() %>%
tab_header(
title = "Table 2: Correlation Between Nutrient Energy Intake and Blood Pressure",
subtitle = "Correlation values for both Systolic and Diastolic Blood Pressure") %>%
fmt_number(columns = c(Pearson_Correlation, Spearman_Correlation), decimals = 2)
| Table 2: Correlation Between Nutrient Energy Intake and Blood Pressure |
| Correlation values for both Systolic and Diastolic Blood Pressure |
| Measure |
Pearson_Correlation |
Spearman_Correlation |
| FibreEnergy vs Systolic |
0.02 |
0.02 |
| FibreEnergy vs Diastolic |
−0.09 |
−0.09 |
| CarbohydrateEnergy vs Systolic |
−0.04 |
−0.05 |
| CarbohydrateEnergy vs Diastolic |
−0.09 |
−0.10 |
Non-linear correlation table
Fibre_syst_kendall <- cor(selected_data$FibreEnergy, selected_data$Systolic, method = "kendall")
Fibre_syst_mic <- mine(selected_data$FibreEnergy, selected_data$Systolic)$MIC
Fibre_dias_kendall <- cor(selected_data$FibreEnergy, selected_data$Diastolic, method = "kendall")
Fibre_dias_mic <- mine(selected_data$FibreEnergy, selected_data$Diastolic)$MIC
carbo_syst_kendall <- cor(selected_data$CarbohydrateEnergy, selected_data$Systolic, method = "kendall")
carbo_syst_mic <- mine(selected_data$CarbohydrateEnergy, selected_data$Systolic)$MIC
carbo_dias_kendall <- cor(selected_data$CarbohydrateEnergy, selected_data$Diastolic, method = "kendall")
carbo_dias_mic <- mine(selected_data$CarbohydrateEnergy, selected_data$Diastolic)$MIC
correlation_table_mic_kendall <- data.frame(
Measure = c("FibreEnergy vs Systolic", "FibreEnergy vs Diastolic",
"CarbohydrateEnergy vs Systolic", "CarbohydrateEnergy vs Diastolic"),
Kendall_Correlation = c(Fibre_syst_kendall, Fibre_dias_kendall, carbo_syst_kendall, carbo_dias_kendall),
MIC_Correlation = c(Fibre_syst_mic, Fibre_dias_mic, carbo_syst_mic, carbo_dias_mic))
correlation_table_mic_kendall %>%
gt() %>%
tab_header(
title = "Table 3: Kendall's Tau and MIC Correlation between Nutrient Energy Intake and Blood Pressure") %>%
fmt_number(columns = c(Kendall_Correlation, MIC_Correlation), decimals = 3)
| Table 3: Kendall's Tau and MIC Correlation between Nutrient Energy Intake and Blood Pressure |
| Measure |
Kendall_Correlation |
MIC_Correlation |
| FibreEnergy vs Systolic |
0.010 |
0.044 |
| FibreEnergy vs Diastolic |
−0.063 |
0.058 |
| CarbohydrateEnergy vs Systolic |
−0.033 |
0.059 |
| CarbohydrateEnergy vs Diastolic |
−0.070 |
0.072 |
Hypothesis testing
median_Fibre <- median(selected_data$FibreEnergy)
selected_data$FibreGroup <- ifelse(selected_data$FibreEnergy > median_Fibre, "High Fibre", "Low Fibre")
median_carb <- median(selected_data$CarbohydrateEnergy)
selected_data$CarboGroup <- ifelse(selected_data$CarbohydrateEnergy > median_carb, "High Carbo", "Low Carbo")
Fibre_systolic_anova <- aov(Systolic ~ FibreGroup, data = selected_data)
Fibre_diastolic_anova <- aov(Diastolic ~ FibreGroup, data = selected_data)
carbo_systolic_anova <- aov(Systolic ~ CarboGroup, data = selected_data)
carbo_diastolic_anova <- aov(Diastolic ~ CarboGroup, data = selected_data)
anova_results <- data.frame(
Measure = c("Fibre vs Systolic", "Fibre vs Diastolic",
"Carbohydrate vs Systolic", "Carbohydrate vs Diastolic"),
F_Statistic = c(summary(Fibre_systolic_anova)[[1]][["F value"]][1],
summary(Fibre_diastolic_anova)[[1]][["F value"]][1],
summary(carbo_systolic_anova)[[1]][["F value"]][1],
summary(carbo_diastolic_anova)[[1]][["F value"]][1]),
P_Value = c(summary(Fibre_systolic_anova)[[1]][["Pr(>F)"]][1],
summary(Fibre_diastolic_anova)[[1]][["Pr(>F)"]][1],
summary(carbo_systolic_anova)[[1]][["Pr(>F)"]][1],
summary(carbo_diastolic_anova)[[1]][["Pr(>F)"]][1])
)
anova_results %>%
gt() %>%
tab_header(
title = "Table 4: ANOVA Test Results: Nutrient Energy Intake vs Blood Pressure"
) %>%
fmt_number(columns = c(F_Statistic, P_Value), decimals = 3) %>%
cols_label(
Measure = "Nutrient vs Blood Pressure",
F_Statistic = "F-Statistic",
P_Value = "P-Value"
) %>%
tab_options(
table.font.size = "medium",
heading.align = "center"
)
| Table 4: ANOVA Test Results: Nutrient Energy Intake vs Blood Pressure |
| Nutrient vs Blood Pressure |
F-Statistic |
P-Value |
| Fibre vs Systolic |
0.842 |
0.359 |
| Fibre vs Diastolic |
41.924 |
0.000 |
| Carbohydrate vs Systolic |
5.385 |
0.020 |
| Carbohydrate vs Diastolic |
52.412 |
0.000 |
Model comparison
Systolic
set.seed(3888)
k <- 5
repeats <- 3
gam_rmse_systolic <- numeric()
gam_mae_systolic <- numeric()
gam_r2_systolic <- numeric()
mlr_rmse_systolic <- numeric()
mlr_mae_systolic <- numeric()
mlr_r2_systolic <- numeric()
robust_rmse_systolic <- numeric()
robust_mae_systolic <- numeric()
robust_r2_systolic <- numeric()
n <- nrow(normalized_data)
for (r in 1:repeats) {
folds <- sample(rep(1:k, length.out = n))
for (i in 1:k) {
train_idx <- which(folds != i)
test_idx <- which(folds == i)
data_train <- normalized_data[train_idx, ]
data_test <- normalized_data[test_idx, ]
actual <- data_test$Systolic
#GAM
gam_model <- gam(Systolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) +
Identity + s(SodiumPotassiumRatio),
data = data_train)
gam_predictions <- predict(gam_model, newdata = data_test)
gam_rmse <- sqrt(mean((gam_predictions - actual)^2))
gam_mae <- mean(abs(gam_predictions - actual))
gam_r2 <- cor(gam_predictions, actual)^2
gam_rmse_systolic <- c(gam_rmse_systolic, gam_rmse)
gam_mae_systolic <- c(gam_mae_systolic, gam_mae)
gam_r2_systolic <- c(gam_r2_systolic, gam_r2)
#Multiple Linear Regression
mlr_model <- lm(Systolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
AlcoholPercentage + FibreEnergy + CarbohydrateEnergy +
Identity + SodiumPotassiumRatio, data = data_train)
mlr_predictions <- predict(mlr_model, newdata = data_test)
mlr_rmse <- sqrt(mean((mlr_predictions - actual)^2))
mlr_mae <- mean(abs(mlr_predictions - actual))
mlr_r2 <- cor(mlr_predictions, actual)^2
mlr_rmse_systolic <- c(mlr_rmse_systolic, mlr_rmse)
mlr_mae_systolic <- c(mlr_mae_systolic, mlr_mae)
mlr_r2_systolic <- c(mlr_r2_systolic, mlr_r2)
#Robust Regression
robust_model <- rlm(Systolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
AlcoholPercentage + FibreEnergy + CarbohydrateEnergy +
Identity + SodiumPotassiumRatio, data = data_train)
robust_predictions <- predict(robust_model, newdata = data_test)
robust_rmse <- sqrt(mean((robust_predictions - actual)^2))
robust_mae <- mean(abs(robust_predictions - actual))
robust_r2 <- cor(robust_predictions, actual)^2
robust_rmse_systolic <- c(robust_rmse_systolic, robust_rmse)
robust_mae_systolic <- c(robust_mae_systolic, robust_mae)
robust_r2_systolic <- c(robust_r2_systolic, robust_r2)
}
}
mean_systolic_gam_rmse <- mean(gam_rmse_systolic)
mean_systolic_gam_mae <- mean(gam_mae_systolic)
mean_systolic_gam_r2 <- mean(gam_r2_systolic)
mean_systolic_mlr_rmse <- mean(mlr_rmse_systolic)
mean_systolic_mlr_mae <- mean(mlr_mae_systolic)
mean_systolic_mlr_r2 <- mean(mlr_r2_systolic)
mean_systolic_robust_rmse <- mean(robust_rmse_systolic)
mean_systolic_robust_mae <- mean(robust_mae_systolic)
mean_systolic_robust_r2 <- mean(robust_r2_systolic)
Systolic_df <- data.frame(
Model = c("GAM", "Multiple Linear", "Robust"),
RMSE = c(mean_systolic_gam_rmse, mean_systolic_mlr_rmse, mean_systolic_robust_rmse),
MAE = c(mean_systolic_gam_mae, mean_systolic_mlr_mae, mean_systolic_robust_mae),
R2 = c(mean_systolic_gam_r2, mean_systolic_mlr_r2, mean_systolic_robust_r2)
)
colors <- c('#FF5179', '#BF59D9', '#E76BF3')
rmse_range <- c(min(Systolic_df$RMSE) * 0.95, max(Systolic_df$RMSE) * 1.05)
mae_range <- c(min(Systolic_df$MAE) * 0.95, max(Systolic_df$MAE) * 1.05)
r2_range <- c(min(Systolic_df$R2) * 0.95, max(Systolic_df$R2) * 1.05)
Systolic_rmse <- plot_ly(Systolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'RMSE', range = rmse_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Systolic_mae <- plot_ly(Systolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'MAE', range = mae_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Systolic_r2 <- plot_ly(Systolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'R²', range = r2_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
SystolicFig <- subplot(Systolic_rmse, Systolic_mae, Systolic_r2, nrows = 1, shareY = FALSE) %>%
layout(
title = list(
text = "Figure 2: Model Comparison (Systolic)",
font = list(size = 16),x = 0.5, y = 1.15),
annotations = list(
list(text = "A",x = 0, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
list(text = "B", x = 0.35, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
list(
text = "C",
x = 0.70,
y = 1.05,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 14, color = "black"),
xanchor = "center"),
list(
text = "A. RMSE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
x = 0,
y = -0.15,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
),
list(
text = "B. MAE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
x = 0,
y = -0.2,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
),
list(
text = "C. R² comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
x = 0,
y = -0.25,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
)
),
margin = list(b = 140, t = 80)
)
SystolicFig
Diastolic
set.seed(3888)
gam_rmse_diastolic <- numeric()
gam_mae_diastolic <- numeric()
gam_r2_diastolic <- numeric()
mlr_rmse_diastolic <- numeric()
mlr_mae_diastolic <- numeric()
mlr_r2_diastolic <- numeric()
robust_rmse_diastolic <- numeric()
robust_mae_diastolic <- numeric()
robust_r2_diastolic <- numeric()
for (r in 1:repeats) {
folds <- sample(rep(1:k, length.out = n))
for (i in 1:k) {
train_idx <- which(folds != i)
test_idx <- which(folds == i)
data_train <- normalized_data[train_idx, ]
data_test <- normalized_data[test_idx, ]
actual <- data_test$Diastolic
#GAM
gam_model <- gam(Diastolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) +
Identity + s(SodiumPotassiumRatio),
data = data_train)
gam_predictions <- predict(gam_model, newdata = data_test)
gam_rmse <- sqrt(mean((gam_predictions - actual)^2))
gam_mae <- mean(abs(gam_predictions - actual))
gam_r2 <- cor(gam_predictions, actual)^2
gam_rmse_diastolic <- c(gam_rmse_diastolic, gam_rmse)
gam_mae_diastolic <- c(gam_mae_diastolic, gam_mae)
gam_r2_diastolic <- c(gam_r2_diastolic, gam_r2)
#Multiple Linear Regression
mlr_model <- lm(Diastolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
AlcoholPercentage + FibreEnergy + CarbohydrateEnergy +
Identity + SodiumPotassiumRatio, data = data_train)
mlr_predictions <- predict(mlr_model, newdata = data_test)
mlr_rmse <- sqrt(mean((mlr_predictions - actual)^2))
mlr_mae <- mean(abs(mlr_predictions - actual))
mlr_r2 <- cor(mlr_predictions, actual)^2
mlr_rmse_diastolic <- c(mlr_rmse_diastolic, mlr_rmse)
mlr_mae_diastolic <- c(mlr_mae_diastolic, mlr_mae)
mlr_r2_diastolic <- c(mlr_r2_diastolic, mlr_r2)
#Robust Regression
robust_model <- rlm(Diastolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
AlcoholPercentage + FibreEnergy + CarbohydrateEnergy +
Identity + SodiumPotassiumRatio, data = data_train)
robust_predictions <- predict(robust_model, newdata = data_test)
robust_rmse <- sqrt(mean((robust_predictions - actual)^2))
robust_mae <- mean(abs(robust_predictions - actual))
robust_r2 <- cor(robust_predictions, actual)^2
robust_rmse_diastolic <- c(robust_rmse_diastolic, robust_rmse)
robust_mae_diastolic <- c(robust_mae_diastolic, robust_mae)
robust_r2_diastolic <- c(robust_r2_diastolic, robust_r2)
}
}
mean_diastolic_gam_rmse <- mean(gam_rmse_diastolic)
mean_diastolic_gam_mae <- mean(gam_mae_diastolic)
mean_diastolic_gam_r2 <- mean(gam_r2_diastolic)
mean_diastolic_mlr_rmse <- mean(mlr_rmse_diastolic)
mean_diastolic_mlr_mae <- mean(mlr_mae_diastolic)
mean_diastolic_mlr_r2 <- mean(mlr_r2_diastolic)
mean_diastolic_robust_rmse <- mean(robust_rmse_diastolic)
mean_diastolic_robust_mae <- mean(robust_mae_diastolic)
mean_diastolic_robust_r2 <- mean(robust_r2_diastolic)
Diastolic_df <- data.frame(
Model = c("GAM", "Multiple Linear", "Robust"),
RMSE = c(mean_diastolic_gam_rmse, mean_diastolic_mlr_rmse, mean_diastolic_robust_rmse),
MAE = c(mean_diastolic_gam_mae, mean_diastolic_mlr_mae, mean_diastolic_robust_mae),
R2 = c(mean_diastolic_gam_r2, mean_diastolic_mlr_r2, mean_diastolic_robust_r2)
)
colors <- c('#FF5179', '#BF59D9', '#E76BF3')
rmse_range <- c(min(Diastolic_df$RMSE) * 0.95, max(Diastolic_df$RMSE) * 1.05)
mae_range <- c(min(Diastolic_df$MAE) * 0.95, max(Diastolic_df$MAE) * 1.05)
r2_range <- c(min(Diastolic_df$R2) * 0.95, max(Diastolic_df$R2) * 1.05)
Diastolic_rmse <- plot_ly(Diastolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'RMSE', range = rmse_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Diastolic_mae <- plot_ly(Diastolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'MAE', range = mae_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Diastolic_r2 <- plot_ly(Diastolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'R²', range = r2_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
DiastolicFig <- subplot(
Diastolic_rmse,
Diastolic_mae,
Diastolic_r2,
nrows = 1,
shareY = FALSE
) %>%
layout(
title = list(
text = "Figure 3: Model Comparison (Diastolic)",
font = list(size = 16),x = 0.5, y = 1.15),
annotations = list(
list(text = "A",x = 0, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
list(text = "B", x = 0.35, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
list(
text = "C",
x = 0.70,
y = 1.05,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 14, color = "black"),
xanchor = "center"),
list(
text = "A. RMSE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
x = 0,
y = -0.15,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
),
list(
text = "B. MAE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
x = 0,
y = -0.2,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
),
list(
text = "C. R² comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
x = 0,
y = -0.25,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
)
),
margin = list(b = 140, t = 80)
)
DiastolicFig
Full model vs Reduced model
set.seed(3888)
k <- 5
repeats <- 3
gam_full_rmse_systolic <- numeric()
gam_full_mae_systolic <- numeric()
gam_full_r2_systolic <- numeric()
gam_reduced_rmse_systolic <- numeric()
gam_reduced_mae_systolic <- numeric()
gam_reduced_r2_systolic <- numeric()
n <- nrow(normalized_data)
for (r in 1:repeats) {
folds <- sample(rep(1:k, length.out = n))
for (i in 1:k) {
train_idx <- which(folds != i)
test_idx <- which(folds == i)
data_train <- normalized_data[train_idx, ]
data_test <- normalized_data[test_idx, ]
actual <- data_test$Systolic
#GAM full
gam_full_model <- gam(Systolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) +
Identity + s(SodiumPotassiumRatio),
data = data_train)
gam_full_predictions <- predict(gam_full_model, newdata = data_test)
gam_full_rmse <- sqrt(mean((gam_full_predictions - actual)^2))
gam_full_mae <- mean(abs(gam_full_predictions - actual))
gam_full_r2 <- cor(gam_full_predictions, actual)^2
gam_full_rmse_systolic <- c(gam_full_rmse_systolic, gam_full_rmse)
gam_full_mae_systolic <- c(gam_full_mae_systolic, gam_full_mae)
gam_full_r2_systolic <- c(gam_full_r2_systolic, gam_full_r2)
#GAM reduced
gam_reduced_model <- gam(Systolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
s(AlcoholPercentage) + Identity + s(SodiumPotassiumRatio), data = data_train)
gam_reduced_predictions <- predict(gam_reduced_model, newdata = data_test)
gam_reduced_rmse <- sqrt(mean((gam_reduced_predictions - actual)^2))
gam_reduced_mae <- mean(abs(gam_reduced_predictions - actual))
gam_reduced_r2 <- cor(gam_reduced_predictions, actual)^2
gam_reduced_rmse_systolic <- c(gam_reduced_rmse_systolic, gam_reduced_rmse)
gam_reduced_mae_systolic <- c(gam_reduced_mae_systolic, gam_reduced_mae)
gam_reduced_r2_systolic <- c(gam_reduced_r2_systolic, gam_reduced_r2)
}
}
mean_systolic_gam_full_rmse <- mean(gam_full_rmse_systolic)
mean_systolic_gam_full_mae <- mean(gam_full_mae_systolic)
mean_systolic_gam_full_r2 <- mean(gam_full_r2_systolic)
mean_systolic_gam_reduced_rmse <- mean(gam_reduced_rmse_systolic)
mean_systolic_gam_reduced_mae <- mean(gam_reduced_mae_systolic)
mean_systolic_gam_reduced_r2 <- mean(gam_reduced_r2_systolic)
Systolic_df <- data.frame(
Model = c("GAM Full", "GAM Reduced"),
RMSE = c(mean_systolic_gam_full_rmse, mean_systolic_gam_reduced_rmse),
MAE = c(mean_systolic_gam_full_mae, mean_systolic_gam_reduced_mae),
R2 = c(mean_systolic_gam_full_r2, mean_systolic_gam_reduced_r2)
)
rmse_range <- c(min(Systolic_df$RMSE) * 0.95, max(Systolic_df$RMSE) * 1.05)
mae_range <- c(min(Systolic_df$MAE) * 0.95, max(Systolic_df$MAE) * 1.05)
r2_range <- c(min(Systolic_df$R2) * 0.95, max(Systolic_df$R2) * 1.05)
Systolic_rmse <- plot_ly(Systolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'RMSE', range = rmse_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Systolic_mae <- plot_ly(Systolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'MAE', range = mae_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Systolic_r2 <- plot_ly(Systolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'R²', range = r2_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
SystolicFig <- subplot(Systolic_rmse, Systolic_mae, Systolic_r2, nrows = 1, shareY = FALSE) %>%
layout(
title = list(
text = "Figure 4: GAM Full Model* and Reduced Model** Comparison (Systolic)",
font = list(size = 16),
x = 0.5,
y = 1.15
),
annotations = list(
list(
text = "A",
x = 0,
y = 1.05,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 14, color = "black"),
xanchor = "center"
),
list(
text = "B",
x = 0.35,
y = 1.05,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 14, color = "black"),
xanchor = "center"
),
list(
text = "C",
x = 0.70,
y = 1.05,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 14, color = "black"),
xanchor = "center"
),
list(
text = "A. RMSE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Systolic)",
x = 0,
y = -0.15,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
),
list(
text = "B. MAE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Systolic)",
x = 0,
y = -0.2,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
),
list(
text = "C. R² comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Systolic)",
x = 0,
y = -0.25,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
), list(
text = "*Full Model: Model with all risk factors and carb and fiber",
x = 0,
y = -0.3,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 10),
xanchor = "left"
), list(
text = "**Reduced Model: Model with only risk factors",
x = 0,
y = -0.35,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 10),
xanchor = "left"
)
),
margin = list(b = 140, t = 80)
)
SystolicFig
set.seed(3888)
k <- 5
repeats <- 3
gam_full_rmse_diastolic <- numeric()
gam_full_mae_diastolic <- numeric()
gam_full_r2_diastolic <- numeric()
gam_reduced_rmse_diastolic <- numeric()
gam_reduced_mae_diastolic <- numeric()
gam_reduced_r2_diastolic <- numeric()
n <- nrow(normalized_data)
for (r in 1:repeats) {
folds <- sample(rep(1:k, length.out = n))
for (i in 1:k) {
train_idx <- which(folds != i)
test_idx <- which(folds == i)
data_train <- normalized_data[train_idx, ]
data_test <- normalized_data[test_idx, ]
actual <- data_test$Diastolic
#GAM full
gam_full_model <- gam(Diastolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) +
Identity + s(SodiumPotassiumRatio),
data = data_train)
gam_full_predictions <- predict(gam_full_model, newdata = data_test)
gam_full_rmse <- sqrt(mean((gam_full_predictions - actual)^2))
gam_full_mae <- mean(abs(gam_full_predictions - actual))
gam_full_r2 <- cor(gam_full_predictions, actual)^2
gam_full_rmse_diastolic <- c(gam_full_rmse_diastolic, gam_full_rmse)
gam_full_mae_diastolic <- c(gam_full_mae_diastolic, gam_full_mae)
gam_full_r2_diastolic <- c(gam_full_r2_diastolic, gam_full_r2)
#GAM reduced
gam_reduced_model <- gam(Diastolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
s(AlcoholPercentage) + Identity + s(SodiumPotassiumRatio), data = data_train)
gam_reduced_predictions <- predict(gam_reduced_model, newdata = data_test)
gam_reduced_rmse <- sqrt(mean((gam_reduced_predictions - actual)^2))
gam_reduced_mae <- mean(abs(gam_reduced_predictions - actual))
gam_reduced_r2 <- cor(gam_reduced_predictions, actual)^2
gam_reduced_rmse_diastolic <- c(gam_reduced_rmse_diastolic, gam_reduced_rmse)
gam_reduced_mae_diastolic <- c(gam_reduced_mae_diastolic, gam_reduced_mae)
gam_reduced_r2_diastolic <- c(gam_reduced_r2_diastolic, gam_reduced_r2)
}
}
mean_diastolic_gam_full_rmse <- mean(gam_full_rmse_diastolic)
mean_diastolic_gam_full_mae <- mean(gam_full_mae_diastolic)
mean_diastolic_gam_full_r2 <- mean(gam_full_r2_diastolic)
mean_diastolic_gam_reduced_rmse <- mean(gam_reduced_rmse_diastolic)
mean_diastolic_gam_reduced_mae <- mean(gam_reduced_mae_diastolic)
mean_diastolic_gam_reduced_r2 <- mean(gam_reduced_r2_diastolic)
Diastolic_df <- data.frame(
Model = c("GAM Full", "GAM Reduced"),
RMSE = c(mean_diastolic_gam_full_rmse, mean_diastolic_gam_reduced_rmse),
MAE = c(mean_diastolic_gam_full_mae, mean_diastolic_gam_reduced_mae),
R2 = c(mean_diastolic_gam_full_r2, mean_diastolic_gam_reduced_r2)
)
rmse_range <- c(min(Diastolic_df$RMSE) * 0.95, max(Diastolic_df$RMSE) * 1.05)
mae_range <- c(min(Diastolic_df$MAE) * 0.95, max(Diastolic_df$MAE) * 1.05)
r2_range <- c(min(Diastolic_df$R2) * 0.95, max(Diastolic_df$R2) * 1.05)
Diastolic_rmse <- plot_ly(Diastolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'RMSE', range = rmse_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Diastolic_mae <- plot_ly(Diastolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'MAE', range = mae_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
Diastolic_r2 <- plot_ly(Diastolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
layout(
yaxis = list(title = 'R²', range = r2_range),
showlegend = FALSE,
paper_bgcolor = 'rgba(245, 246, 249, 1)',
plot_bgcolor = 'rgba(0, 0, 0, 0)')
DiastolicFig <- subplot(Diastolic_rmse, Diastolic_mae, Diastolic_r2, nrows = 1, shareY = FALSE) %>%
layout(title = list(
text = "Figure 5: GAM Full Model* and Reduced Model** Comparison (Diastolic)",font = list(size = 16),x = 0.5,y = 1.15),
annotations = list(
list(text = "A",x = 0,y = 1.05,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 14, color = "black"),xanchor = "center"),
list(text = "B",x = 0.35, y = 1.05,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 14, color = "black"),xanchor = "center"),
list(text = "C",x = 0.70, y = 1.05,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 14, color = "black"),xanchor = "center"),
list(text = "A. RMSE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Diastolic)",x = 0,y = -0.15,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 12),xanchor = "left"),
list(text = "B. MAE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Diastolic)",x = 0,y = -0.2,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 12),xanchor = "left"),
list(
text = "C. R² comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Diastolic)",
x = 0,
y = -0.25,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 12),
xanchor = "left"
), list(
text = "*Full Model: Model with all risk factors and carb and fiber",
x = 0,
y = -0.3,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 10),
xanchor = "left"
), list(
text = "**Reduced Model: Model with only risk factors",
x = 0,
y = -0.35,
xref = "paper",
yref = "paper",
showarrow = FALSE,
font = list(size = 10),
xanchor = "left"
)
),
margin = list(b = 140, t = 80)
)
DiastolicFig